The main objective of this project is to perform dimensionality reduction to aid data visualization.
# Load the supermarket sales dataset for this session and preview the
# first rows to confirm it read correctly.
load_dd <- read.csv("http://bit.ly/CarreFourDataset")
head(load_dd)
## Invoice.ID Branch Customer.type Gender Product.line Unit.price
## 1 750-67-8428 A Member Female Health and beauty 74.69
## 2 226-31-3081 C Normal Female Electronic accessories 15.28
## 3 631-41-3108 A Normal Male Home and lifestyle 46.33
## 4 123-19-1176 A Member Male Health and beauty 58.22
## 5 373-73-7910 A Normal Male Sports and travel 86.31
## 6 699-14-3026 C Normal Male Electronic accessories 85.39
## Quantity Tax Date Time Payment cogs gross.margin.percentage
## 1 7 26.1415 1/5/2019 13:08 Ewallet 522.83 4.761905
## 2 5 3.8200 3/8/2019 10:29 Cash 76.40 4.761905
## 3 7 16.2155 3/3/2019 13:23 Credit card 324.31 4.761905
## 4 8 23.2880 1/27/2019 20:33 Ewallet 465.76 4.761905
## 5 7 30.2085 2/8/2019 10:37 Ewallet 604.17 4.761905
## 6 7 29.8865 3/25/2019 18:30 Ewallet 597.73 4.761905
## gross.income Rating Total
## 1 26.1415 9.1 548.9715
## 2 3.8200 9.6 80.2200
## 3 16.2155 7.4 340.5255
## 4 23.2880 8.4 489.0480
## 5 30.2085 5.3 634.3785
## 6 29.8865 4.1 627.6165
# Statistical summary of every column: the id/categorical fields are
# character vectors, while the measurement columns show their ranges
# (e.g. Quantity 1-10, Unit.price ~10-100, Rating 4-10).
summary(load_dd)
## Invoice.ID Branch Customer.type Gender
## Length:1000 Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Product.line Unit.price Quantity Tax
## Length:1000 Min. :10.08 Min. : 1.00 Min. : 0.5085
## Class :character 1st Qu.:32.88 1st Qu.: 3.00 1st Qu.: 5.9249
## Mode :character Median :55.23 Median : 5.00 Median :12.0880
## Mean :55.67 Mean : 5.51 Mean :15.3794
## 3rd Qu.:77.94 3rd Qu.: 8.00 3rd Qu.:22.4453
## Max. :99.96 Max. :10.00 Max. :49.6500
## Date Time Payment cogs
## Length:1000 Length:1000 Length:1000 Min. : 10.17
## Class :character Class :character Class :character 1st Qu.:118.50
## Mode :character Mode :character Mode :character Median :241.76
## Mean :307.59
## 3rd Qu.:448.90
## Max. :993.00
## gross.margin.percentage gross.income Rating Total
## Min. :4.762 Min. : 0.5085 Min. : 4.000 Min. : 10.68
## 1st Qu.:4.762 1st Qu.: 5.9249 1st Qu.: 5.500 1st Qu.: 124.42
## Median :4.762 Median :12.0880 Median : 7.000 Median : 253.85
## Mean :4.762 Mean :15.3794 Mean : 6.973 Mean : 322.97
## 3rd Qu.:4.762 3rd Qu.:22.4453 3rd Qu.: 8.500 3rd Qu.: 471.35
## Max. :4.762 Max. :49.6500 Max. :10.000 Max. :1042.65
# Inspect the structure: 1000 observations of 16 variables, a mix of
# character (categorical) and numeric columns.
str(load_dd)
## 'data.frame': 1000 obs. of 16 variables:
## $ Invoice.ID : chr "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr "A" "C" "A" "A" ...
## $ Customer.type : chr "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Product.line : chr "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit.price : num 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : int 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : chr "13:08" "10:29" "13:23" "20:33" ...
## $ Payment : chr "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num 522.8 76.4 324.3 465.8 604.2 ...
## $ gross.margin.percentage: num 4.76 4.76 4.76 4.76 4.76 ...
## $ gross.income : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## $ Total : num 549 80.2 340.5 489 634.4 ...
# Count missing values in each column; zeros throughout mean the dataset
# is complete and no imputation is required.
missing_counts <- colSums(is.na(load_dd))
missing_counts
## Invoice.ID Branch Customer.type
## 0 0 0
## Gender Product.line Unit.price
## 0 0 0
## Quantity Tax Date
## 0 0 0
## Time Payment cogs
## 0 0 0
## gross.margin.percentage gross.income Rating
## 0 0 0
## Total
## 0
# gross.margin.percentage holds a single repeated value (confirmed by
# unique() below), i.e. zero variance — it is therefore excluded from
# the numeric subset used for PCA/t-SNE later on.
unique(load_dd$gross.margin.percentage)
## [1] 4.761905
hist(load_dd$gross.margin.percentage, col = "brown")
# Keep only the numeric measurement columns.  Selecting by name is robust
# to column reordering, unlike the positional indices c(6,7,8,12,14,15,16).
# The constant gross.margin.percentage column is deliberately excluded,
# as are the id/date/time/categorical fields.
Sales <- load_dd[, c("Unit.price", "Quantity", "Tax", "cogs",
                     "gross.income", "Rating", "Total")]
head(Sales)
## Unit.price Quantity Tax cogs gross.income Rating Total
## 1 74.69 7 26.1415 522.83 26.1415 9.1 548.9715
## 2 15.28 5 3.8200 76.40 3.8200 9.6 80.2200
## 3 46.33 7 16.2155 324.31 16.2155 7.4 340.5255
## 4 58.22 8 23.2880 465.76 23.2880 8.4 489.0480
## 5 86.31 7 30.2085 604.17 30.2085 5.3 634.3785
## 6 85.39 7 29.8865 597.73 29.8865 4.1 627.6165
# Correlation-plot package; library() takes an unquoted package name.
library(corrplot)
## corrplot 0.90 loaded
# Pairwise Pearson correlations between the numeric sales variables.
# cor() accepts a data frame directly, so the as.matrix() coercion is
# unnecessary.  The printout below shows Tax, cogs, gross.income and
# Total are perfectly correlated (they are linear functions of each other).
mydata.rcorr <- cor(Sales)
mydata.rcorr
## Unit.price Quantity Tax cogs gross.income
## Unit.price 1.000000000 0.01077756 0.6339621 0.6339621 0.6339621
## Quantity 0.010777564 1.00000000 0.7055102 0.7055102 0.7055102
## Tax 0.633962089 0.70551019 1.0000000 1.0000000 1.0000000
## cogs 0.633962089 0.70551019 1.0000000 1.0000000 1.0000000
## gross.income 0.633962089 0.70551019 1.0000000 1.0000000 1.0000000
## Rating -0.008777507 -0.01581490 -0.0364417 -0.0364417 -0.0364417
## Total 0.633962089 0.70551019 1.0000000 1.0000000 1.0000000
## Rating Total
## Unit.price -0.008777507 0.6339621
## Quantity -0.015814905 0.7055102
## Tax -0.036441705 1.0000000
## cogs -0.036441705 1.0000000
## gross.income -0.036441705 1.0000000
## Rating 1.000000000 -0.0364417
## Total -0.036441705 1.0000000
# Visualise the correlation matrix; the Tax/cogs/gross.income/Total
# group stands out as a cluster of perfect correlations.
corrplot(mydata.rcorr)
# Load the data-manipulation and plotting libraries used for the PCA
# biplot section.  Unquoted package names throughout, for consistency.
library(openxlsx)
library(reshape2)
library(plyr)
library(scales)
library(ggplot2)
library(devtools)
## Loading required package: usethis
library(grid)
# Install ggbiplot from GitHub only when it is not already available.
# An unconditional install_github() re-contacts the remote on every run
# and fails without network access.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install_github("vqv/ggbiplot", dependencies = TRUE)
}
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
# Drop any constant (zero-variance) columns, since they carry no
# information for dimensionality reduction.  Here no column of Sales has
# zero variance, so sales_data is identical to Sales.
# NOTE(review): sales_data is never used afterwards — the PCA below runs
# on Sales directly; consider feeding sales_data into prcomp() instead.
sales_data <- Sales[, apply(Sales, 2, var) != 0]
head(sales_data)
## Unit.price Quantity Tax cogs gross.income Rating Total
## 1 74.69 7 26.1415 522.83 26.1415 9.1 548.9715
## 2 15.28 5 3.8200 76.40 3.8200 9.6 80.2200
## 3 46.33 7 16.2155 324.31 16.2155 7.4 340.5255
## 4 58.22 8 23.2880 465.76 23.2880 8.4 489.0480
## 5 86.31 7 30.2085 604.17 30.2085 5.3 634.3785
## 6 85.39 7 29.8865 597.73 29.8865 4.1 627.6165
# PCA on the numeric sales variables.  Sales already contains exactly the
# seven columns of interest, so the redundant Sales[, c(1:7)] indexing is
# dropped.  Centering is on but scaling is off, so the high-variance
# columns (Total, cogs) dominate — which is why PC1 alone explains
# ~99.6% of the variance in the summary below.
# NOTE(review): consider scale. = TRUE so each variable contributes on a
# comparable footing.
sale.pca <- prcomp(Sales, center = TRUE, scale. = FALSE)
summary(sale.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 340.3819 20.53212 1.71932 1.24589 1.678e-13 7.548e-15
## Proportion of Variance 0.9963 0.00363 0.00003 0.00001 0.000e+00 0.000e+00
## Cumulative Proportion 0.9963 0.99996 0.99999 1.00000 1.000e+00 1.000e+00
## PC7
## Standard deviation 1.78e-15
## Proportion of Variance 0.00e+00
## Cumulative Proportion 1.00e+00
# Load ggbiplot and draw the basic PCA biplot.
library(ggbiplot)
ggbiplot(sale.pca)
# Redraw with row labels and ellipse drawing enabled; the small
# obs.scale/var.scale values shrink points and arrows for readability.
ggbiplot(sale.pca, ellipse = TRUE, labels = rownames(load_dd),
         obs.scale = 0.01, var.scale = 0.01)
## Dimensionality Reduction (t-SNE)
# Load the t-SNE packages.  library() is preferred over require() for
# mandatory dependencies: require() only returns FALSE on failure,
# while library() stops with an error.
# NOTE(review): only the 'tsne' package is actually used below; 'Rtsne'
# is loaded but never called.
library(Rtsne)
library(tsne)
## Loading required package: tsne
trn <- data.matrix(Sales)
cols <- rainbow(10)
# tsne() maps each row to a 2-D embedding; the callback redraws the map
# every 'epoch' iterations so progress can be watched.  The duplicate
# require(tsne) that appeared here has been removed (the package is
# already loaded above), and the partially matched argument t='n' is
# spelled out as type = "n".
# NOTE(review): trn[, 7] is the continuous Total column, so
# cols[trn[, 7] + 1] indexes far beyond length(cols) == 10 and yields NA
# colours for most points — colour by a low-cardinality variable
# (e.g. Quantity, trn[, 2]) instead; confirm intended grouping.
ecb <- function(x, y) {
  plot(x, type = "n")
  text(x, labels = trn[, 7], col = cols[trn[, 7] + 1])
}
tsne_res <- tsne(trn[, 1:7], epoch_callback = ecb, perplexity = 50, epoch = 50)
## Warning in if (class(X) == "dist") {: the condition has length > 1 and only the
## first element will be used
## sigma summary: Min. : 0.356793818656386 |1st Qu. : 0.43997751392607 |Median : 0.476846570682487 |Mean : 0.484869852342844 |3rd Qu. : 0.519514782797261 |Max. : 0.692793261319319 |
## Epoch: Iteration #50 error is: 13.4902191558145
## Epoch: Iteration #100 error is: 13.5098537997885
## Epoch: Iteration #150 error is: 0.686013287951787
## Epoch: Iteration #200 error is: 0.614330555824014
## Epoch: Iteration #250 error is: 0.591869496225693
## Epoch: Iteration #300 error is: 0.573758313306361
## Epoch: Iteration #350 error is: 0.569254866624733
## Epoch: Iteration #400 error is: 0.567818392183111
## Epoch: Iteration #450 error is: 0.567181582501048
## Epoch: Iteration #500 error is: 0.566844964253547
## Epoch: Iteration #550 error is: 0.566644135988815
## Epoch: Iteration #600 error is: 0.566435025657946
## Epoch: Iteration #650 error is: 0.566350908593689
## Epoch: Iteration #700 error is: 0.566289100625877
## Epoch: Iteration #750 error is: 0.566244578366584
## Epoch: Iteration #800 error is: 0.566211776567411
## Epoch: Iteration #850 error is: 0.566187020264634
## Epoch: Iteration #900 error is: 0.5661679924371
## Epoch: Iteration #950 error is: 0.5661531298016
## Epoch: Iteration #1000 error is: 0.566141509037671
From the plot above, t-SNE does a good job of reducing the dimensions and revealing clusters; its only drawback is that it takes a long time to finish executing.